import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Import the item definition used to structure scraped records
from scrapy_dushuwang.items import ScrapyDushuwangItem


class ReadSpider(CrawlSpider):
    """Crawl paginated book listings on dushu.com (category 1175).

    Follows every pagination link matching ``/book/1175_<n>.html`` and
    yields one :class:`ScrapyDushuwangItem` per book found on each page.
    """

    name = 'read'
    allowed_domains = ['www.dushu.com']
    start_urls = ['https://www.dushu.com/book/1175_1.html']

    rules = (
        Rule(LinkExtractor(
                allow=r'/book/1175_\d+\.html'),
                callback='parse_item',
                # follow=True keeps extracting pagination links from every
                # visited page (crawls all pages); False would only follow
                # links found on the start page.
                follow=True),
    )

    def parse_item(self, response):
        """Extract book name and cover-image URL from one listing page.

        The site lazy-loads cover images, so the real image URL lives in
        the ``data-original`` attribute rather than ``src``.
        """
        # Use the spider's logger instead of a bare print() so output
        # honors Scrapy's LOG_LEVEL settings.
        self.logger.debug('Parsing listing page: %s', response.url)

        # XPaths used:
        #   //div[@class="book-info"]//img/@alt            -> book title
        #   //div[@class="book-info"]//img/@data-original  -> cover URL
        img_list = response.xpath('//div[@class="book-info"]//img')
        for img in img_list:
            name = img.xpath('./@alt').get()
            src = img.xpath('./@data-original').get()

            # Package the scraped fields into the project's item type.
            yield ScrapyDushuwangItem(name=name, src=src)
